We download the data for AirBnb in Rio
listings <- vroom("http://data.insideairbnb.com/brazil/rj/rio-de-janeiro/2020-06-19/data/listings.csv.gz") %>%
clean_names()
Quick description of some of the variables collected, with cost data typically expressed in US$
# We try to concert to numerical variable when it is possible
numeric_listings <- listings %>%
mutate(price = parse_number(price),
cleaning_fee = parse_number(cleaning_fee),
extra_people = parse_number(extra_people),
weekly_price = parse_number(weekly_price),
monthly_price = parse_number(monthly_price),
security_deposit = parse_number(security_deposit)
)
We will analyse the raw values
dplyr::glimpse(listings)
## Rows: 35,731
## Columns: 106
## $ id <dbl> 17878, 21280, 25026, 315…
## $ listing_url <chr> "https://www.airbnb.com/…
## $ scrape_id <dbl> 2.020062e+13, 2.020062e+…
## $ last_scraped <date> 2020-06-19, 2020-06-19,…
## $ name <chr> "Very Nice 2Br in Copaca…
## $ summary <chr> "Discounts for long term…
## $ space <chr> "- Beautiful, sunny 2 be…
## $ description <chr> "Discounts for long term…
## $ experiences_offered <chr> "none", "none", "none", …
## $ neighborhood_overview <chr> "This is the one of the …
## $ notes <chr> NA, NA, "For any stay su…
## $ transit <chr> "Excellent location. Clo…
## $ access <chr> "The entire apartment is…
## $ interaction <chr> "I will be available thr…
## $ house_rules <chr> "Please leave the apartm…
## $ thumbnail_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ medium_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ picture_url <chr> "https://a0.muscache.com…
## $ xl_picture_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ host_id <dbl> 68997, 81163, 102840, 13…
## $ host_url <chr> "https://www.airbnb.com/…
## $ host_name <chr> "Matthias", "Jules", "Vi…
## $ host_since <date> 2010-01-08, 2010-02-14,…
## $ host_location <chr> "Rio de Janeiro, State o…
## $ host_about <chr> "I am a journalist/wri…
## $ host_response_time <chr> "within an hour", "withi…
## $ host_response_rate <chr> "100%", "100%", "70%", "…
## $ host_acceptance_rate <chr> "100%", "90%", "77%", "1…
## $ host_is_superhost <lgl> TRUE, FALSE, FALSE, TRUE…
## $ host_thumbnail_url <chr> "https://a0.muscache.com…
## $ host_picture_url <chr> "https://a0.muscache.com…
## $ host_neighbourhood <chr> "Copacabana", "Ipanema",…
## $ host_listings_count <dbl> 2, 0, 3, 1, 1, 1, 7, 2, …
## $ host_total_listings_count <dbl> 2, 0, 3, 1, 1, 1, 7, 2, …
## $ host_verifications <chr> "['email', 'phone', 'rev…
## $ host_has_profile_pic <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ host_identity_verified <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ street <chr> "Rio de Janeiro, Rio de …
## $ neighbourhood <chr> "Copacabana", "Ipanema",…
## $ neighbourhood_cleansed <chr> "Copacabana", "Ipanema",…
## $ neighbourhood_group_cleansed <lgl> NA, NA, NA, NA, NA, NA, …
## $ city <chr> "Rio de Janeiro", "Rio d…
## $ state <chr> "Rio de Janeiro", "RJ", …
## $ zipcode <chr> "22020-050", "22420-010"…
## $ market <chr> "Rio De Janeiro", "Rio D…
## $ smart_location <chr> "Rio de Janeiro, Brazil"…
## $ country_code <chr> "BR", "BR", "BR", "BR", …
## $ country <chr> "Brazil", "Brazil", "Bra…
## $ latitude <dbl> -22.96592, -22.98467, -2…
## $ longitude <dbl> -43.17896, -43.19611, -4…
## $ is_location_exact <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ property_type <chr> "Condominium", "Apartmen…
## $ room_type <chr> "Entire home/apt", "Enti…
## $ accommodates <dbl> 5, 6, 2, 3, 2, 2, 13, 1,…
## $ bathrooms <dbl> 1.0, 2.0, 1.0, 1.0, 1.5,…
## $ bedrooms <dbl> 2, 2, 1, 1, 1, 1, 6, 1, …
## $ beds <dbl> 2, 4, 2, 1, 1, 1, 6, 1, …
## $ bed_type <chr> "Real Bed", "Real Bed", …
## $ amenities <chr> "{TV,\"Cable TV\",Intern…
## $ square_feet <dbl> NA, NA, NA, NA, NA, 0, N…
## $ price <chr> "$191.00", "$424.00", "$…
## $ weekly_price <chr> NA, "$4,945.00", NA, NA,…
## $ monthly_price <chr> NA, "$17,455.00", NA, NA…
## $ security_deposit <chr> "$0.00", "$2,647.00", "$…
## $ cleaning_fee <chr> "$250.00", "$265.00", "$…
## $ guests_included <dbl> 2, 6, 2, 2, 2, 2, 7, 1, …
## $ extra_people <chr> "$0.00", "$0.00", "$45.0…
## $ minimum_nights <dbl> 7, 5, 7, 2, 2, 3, 2, 3, …
## $ maximum_nights <dbl> 180, 30, 60, 1125, 89, 2…
## $ minimum_minimum_nights <dbl> 7, 5, 7, 2, 2, 3, 2, 3, …
## $ maximum_minimum_nights <dbl> 7, 5, 7, 2, 2, 3, 2, 3, …
## $ minimum_maximum_nights <dbl> 1125, 30, 60, 1125, 89, …
## $ maximum_maximum_nights <dbl> 1125, 30, 60, 1125, 89, …
## $ minimum_nights_avg_ntm <dbl> 7.0, 5.0, 7.0, 2.0, 2.0,…
## $ maximum_nights_avg_ntm <dbl> 1125, 30, 60, 1125, 89, …
## $ calendar_updated <chr> "4 months ago", "4 month…
## $ has_availability <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ availability_30 <dbl> 0, 19, 0, 0, 0, 23, 27, …
## $ availability_60 <dbl> 6, 49, 0, 0, 0, 52, 54, …
## $ availability_90 <dbl> 36, 79, 0, 0, 0, 71, 84,…
## $ availability_365 <dbl> 287, 93, 154, 0, 170, 12…
## $ calendar_last_scraped <date> 2020-06-19, 2020-06-19,…
## $ number_of_reviews <dbl> 251, 94, 238, 282, 181, …
## $ number_of_reviews_ltm <dbl> 20, 6, 7, 25, 23, 33, 22…
## $ first_review <date> 2010-07-15, 2014-02-14,…
## $ last_review <date> 2020-04-06, 2020-03-22,…
## $ review_scores_rating <dbl> 93, 97, 94, 96, 94, 98, …
## $ review_scores_accuracy <dbl> 9, 10, 9, 10, 10, 10, 10…
## $ review_scores_cleanliness <dbl> 10, 10, 9, 10, 9, 10, 9,…
## $ review_scores_checkin <dbl> 10, 10, 9, 10, 10, 10, 1…
## $ review_scores_communication <dbl> 10, 10, 10, 10, 10, 10, …
## $ review_scores_location <dbl> 10, 10, 10, 10, 10, 10, …
## $ review_scores_value <dbl> 9, 10, 9, 10, 9, 10, 9, …
## $ requires_license <lgl> FALSE, FALSE, FALSE, FAL…
## $ license <lgl> NA, NA, NA, NA, NA, NA, …
## $ jurisdiction_names <lgl> NA, NA, NA, NA, NA, NA, …
## $ instant_bookable <lgl> TRUE, FALSE, FALSE, TRUE…
## $ is_business_travel_ready <lgl> FALSE, FALSE, FALSE, FAL…
## $ cancellation_policy <chr> "moderate", "strict_14_w…
## $ require_guest_profile_picture <lgl> FALSE, FALSE, TRUE, FALS…
## $ require_guest_phone_verification <lgl> FALSE, FALSE, TRUE, FALS…
## $ calculated_host_listings_count <dbl> 1, 1, 3, 1, 1, 1, 5, 1, …
## $ calculated_host_listings_count_entire_homes <dbl> 1, 1, 3, 1, 1, 1, 3, 0, …
## $ calculated_host_listings_count_private_rooms <dbl> 0, 0, 0, 0, 0, 0, 2, 1, …
## $ calculated_host_listings_count_shared_rooms <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reviews_per_month <dbl> 2.08, 1.22, 1.95, 2.33, …
There are 106 columns with 35,731 rows. On all this data, we have 46 columns with character variables, 5 with dates, 16 with logical and 39 columns with numbers.
Computing summary statistics of the variables of interest, or finding NAs
skimr::skim(listings)
| Name | listings |
| Number of rows | 35731 |
| Number of columns | 106 |
| _______________________ | |
| Column type frequency: | |
| character | 46 |
| Date | 5 |
| logical | 16 |
| numeric | 39 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| listing_url | 0 | 1.00 | 34 | 37 | 0 | 35731 | 0 |
| name | 58 | 1.00 | 1 | 255 | 0 | 34488 | 0 |
| summary | 2588 | 0.93 | 1 | 1000 | 0 | 31747 | 0 |
| space | 14186 | 0.60 | 1 | 1000 | 0 | 20709 | 0 |
| description | 1433 | 0.96 | 1 | 1000 | 0 | 33409 | 0 |
| experiences_offered | 0 | 1.00 | 4 | 4 | 0 | 1 | 0 |
| neighborhood_overview | 15776 | 0.56 | 1 | 1000 | 0 | 18303 | 0 |
| notes | 24127 | 0.32 | 1 | 1000 | 0 | 10380 | 0 |
| transit | 15967 | 0.55 | 1 | 1000 | 0 | 18237 | 0 |
| access | 19597 | 0.45 | 1 | 1000 | 0 | 14325 | 0 |
| interaction | 18251 | 0.49 | 1 | 1000 | 0 | 15456 | 0 |
| house_rules | 17304 | 0.52 | 1 | 1000 | 0 | 16288 | 0 |
| picture_url | 0 | 1.00 | 81 | 146 | 0 | 35146 | 0 |
| host_url | 0 | 1.00 | 39 | 43 | 0 | 24992 | 0 |
| host_name | 5 | 1.00 | 1 | 35 | 0 | 6429 | 0 |
| host_location | 168 | 1.00 | 2 | 253 | 0 | 1180 | 0 |
| host_about | 18793 | 0.47 | 1 | 8586 | 0 | 9714 | 22 |
| host_response_time | 5 | 1.00 | 3 | 18 | 0 | 5 | 0 |
| host_response_rate | 5 | 1.00 | 2 | 4 | 0 | 63 | 0 |
| host_acceptance_rate | 5 | 1.00 | 2 | 4 | 0 | 100 | 0 |
| host_thumbnail_url | 5 | 1.00 | 55 | 106 | 0 | 24859 | 0 |
| host_picture_url | 5 | 1.00 | 57 | 109 | 0 | 24859 | 0 |
| host_neighbourhood | 13638 | 0.62 | 3 | 33 | 0 | 164 | 0 |
| host_verifications | 0 | 1.00 | 2 | 158 | 0 | 345 | 0 |
| street | 0 | 1.00 | 10 | 112 | 0 | 502 | 0 |
| neighbourhood | 1892 | 0.95 | 3 | 24 | 0 | 101 | 0 |
| neighbourhood_cleansed | 0 | 1.00 | 3 | 24 | 0 | 156 | 0 |
| city | 162 | 1.00 | 1 | 69 | 0 | 317 | 0 |
| state | 35 | 1.00 | 1 | 51 | 0 | 52 | 0 |
| zipcode | 1752 | 0.95 | 1 | 24 | 0 | 4261 | 1 |
| market | 43 | 1.00 | 4 | 21 | 0 | 11 | 0 |
| smart_location | 0 | 1.00 | 9 | 77 | 0 | 341 | 0 |
| country_code | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
| country | 0 | 1.00 | 6 | 6 | 0 | 1 | 0 |
| property_type | 0 | 1.00 | 3 | 22 | 0 | 36 | 0 |
| room_type | 0 | 1.00 | 10 | 15 | 0 | 4 | 0 |
| bed_type | 0 | 1.00 | 5 | 13 | 0 | 5 | 0 |
| amenities | 0 | 1.00 | 2 | 1267 | 0 | 33605 | 0 |
| price | 0 | 1.00 | 5 | 11 | 0 | 1117 | 0 |
| weekly_price | 33335 | 0.07 | 7 | 11 | 0 | 758 | 0 |
| monthly_price | 33070 | 0.07 | 7 | 11 | 0 | 861 | 0 |
| security_deposit | 15680 | 0.56 | 5 | 10 | 0 | 737 | 0 |
| cleaning_fee | 11584 | 0.68 | 5 | 9 | 0 | 461 | 0 |
| extra_people | 0 | 1.00 | 5 | 9 | 0 | 334 | 0 |
| calendar_updated | 0 | 1.00 | 5 | 14 | 0 | 90 | 0 |
| cancellation_policy | 0 | 1.00 | 6 | 27 | 0 | 6 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| last_scraped | 0 | 1.00 | 2020-06-19 | 2020-06-21 | 2020-06-19 | 3 |
| host_since | 5 | 1.00 | 2009-03-29 | 2020-06-17 | 2016-01-26 | 3301 |
| calendar_last_scraped | 0 | 1.00 | 2020-06-19 | 2020-06-21 | 2020-06-19 | 3 |
| first_review | 14991 | 0.58 | 2010-06-07 | 2020-06-18 | 2018-02-14 | 2506 |
| last_review | 14991 | 0.58 | 2012-02-21 | 2020-06-19 | 2020-01-31 | 1529 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| thumbnail_url | 35731 | 0 | NaN | : |
| medium_url | 35731 | 0 | NaN | : |
| xl_picture_url | 35731 | 0 | NaN | : |
| host_is_superhost | 5 | 1 | 0.13 | FAL: 31028, TRU: 4698 |
| host_has_profile_pic | 5 | 1 | 1.00 | TRU: 35588, FAL: 138 |
| host_identity_verified | 5 | 1 | 0.27 | FAL: 26186, TRU: 9540 |
| neighbourhood_group_cleansed | 35731 | 0 | NaN | : |
| is_location_exact | 0 | 1 | 0.69 | TRU: 24496, FAL: 11235 |
| has_availability | 0 | 1 | 1.00 | TRU: 35731 |
| requires_license | 0 | 1 | 0.00 | FAL: 35731 |
| license | 35731 | 0 | NaN | : |
| jurisdiction_names | 35731 | 0 | NaN | : |
| instant_bookable | 0 | 1 | 0.40 | FAL: 21360, TRU: 14371 |
| is_business_travel_ready | 0 | 1 | 0.00 | FAL: 35731 |
| require_guest_profile_picture | 0 | 1 | 0.02 | FAL: 35176, TRU: 555 |
| require_guest_phone_verification | 0 | 1 | 0.02 | FAL: 35177, TRU: 554 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 2.131976e+07 | 13709734.81 | 1.787800e+04 | 1.098372e+07 | 1.707225e+07 | 3.428583e+07 | 4.384991e+07 | ▅▇▃▃▆ |
| scrape_id | 0 | 1.00 | 2.020062e+13 | 0.00 | 2.020062e+13 | 2.020062e+13 | 2.020062e+13 | 2.020062e+13 | 2.020062e+13 | ▁▁▇▁▁ |
| host_id | 0 | 1.00 | 8.502863e+07 | 89449682.73 | 1.173900e+04 | 1.502522e+07 | 5.623333e+07 | 1.122780e+08 | 3.505372e+08 | ▇▃▁▁▁ |
| host_listings_count | 5 | 1.00 | 3.576000e+01 | 312.28 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | 3.000000e+00 | 3.316000e+03 | ▇▁▁▁▁ |
| host_total_listings_count | 5 | 1.00 | 3.576000e+01 | 312.28 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | 3.000000e+00 | 3.316000e+03 | ▇▁▁▁▁ |
| latitude | 0 | 1.00 | -2.297000e+01 | 0.03 | -2.307000e+01 | -2.298000e+01 | -2.297000e+01 | -2.295000e+01 | -2.275000e+01 | ▁▇▃▁▁ |
| longitude | 0 | 1.00 | -4.325000e+01 | 0.10 | -4.374000e+01 | -4.332000e+01 | -4.320000e+01 | -4.319000e+01 | -4.310000e+01 | ▁▁▂▂▇ |
| accommodates | 0 | 1.00 | 4.180000e+00 | 2.63 | 1.000000e+00 | 2.000000e+00 | 4.000000e+00 | 5.000000e+00 | 1.600000e+02 | ▇▁▁▁▁ |
| bathrooms | 67 | 1.00 | 1.670000e+00 | 1.05 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | 2.000000e+00 | 2.000000e+01 | ▇▁▁▁▁ |
| bedrooms | 79 | 1.00 | 1.620000e+00 | 1.09 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | 2.000000e+00 | 3.000000e+01 | ▇▁▁▁▁ |
| beds | 331 | 0.99 | 2.540000e+00 | 2.08 | 0.000000e+00 | 1.000000e+00 | 2.000000e+00 | 3.000000e+00 | 6.200000e+01 | ▇▁▁▁▁ |
| square_feet | 35279 | 0.01 | 5.252900e+02 | 983.16 | 0.000000e+00 | 0.000000e+00 | 1.940000e+02 | 7.530000e+02 | 1.076400e+04 | ▇▁▁▁▁ |
| guests_included | 0 | 1.00 | 1.730000e+00 | 1.60 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 2.000000e+00 | 3.600000e+01 | ▇▁▁▁▁ |
| minimum_nights | 0 | 1.00 | 4.910000e+00 | 22.27 | 1.000000e+00 | 1.000000e+00 | 2.000000e+00 | 4.000000e+00 | 1.123000e+03 | ▇▁▁▁▁ |
| maximum_nights | 0 | 1.00 | 2.865507e+04 | 5290264.02 | 1.000000e+00 | 3.000000e+01 | 1.125000e+03 | 1.125000e+03 | 1.000000e+09 | ▇▁▁▁▁ |
| minimum_minimum_nights | 0 | 1.00 | 4.780000e+00 | 21.18 | 1.000000e+00 | 1.000000e+00 | 2.000000e+00 | 4.000000e+00 | 1.095000e+03 | ▇▁▁▁▁ |
| maximum_minimum_nights | 0 | 1.00 | 5.120000e+00 | 21.42 | 1.000000e+00 | 1.000000e+00 | 3.000000e+00 | 5.000000e+00 | 1.095000e+03 | ▇▁▁▁▁ |
| minimum_maximum_nights | 0 | 1.00 | 2.873711e+04 | 5290263.58 | 1.000000e+00 | 4.000000e+01 | 1.125000e+03 | 1.125000e+03 | 1.000000e+09 | ▇▁▁▁▁ |
| maximum_maximum_nights | 0 | 1.00 | 2.874036e+04 | 5290263.56 | 1.000000e+00 | 4.400000e+01 | 1.125000e+03 | 1.125000e+03 | 1.000000e+09 | ▇▁▁▁▁ |
| minimum_nights_avg_ntm | 0 | 1.00 | 4.890000e+00 | 21.29 | 1.000000e+00 | 1.000000e+00 | 2.100000e+00 | 4.000000e+00 | 1.095000e+03 | ▇▁▁▁▁ |
| maximum_nights_avg_ntm | 0 | 1.00 | 2.873929e+04 | 5290263.57 | 1.000000e+00 | 4.350000e+01 | 1.125000e+03 | 1.125000e+03 | 1.000000e+09 | ▇▁▁▁▁ |
| availability_30 | 0 | 1.00 | 1.654000e+01 | 13.92 | 0.000000e+00 | 0.000000e+00 | 2.700000e+01 | 3.000000e+01 | 3.000000e+01 | ▆▁▁▁▇ |
| availability_60 | 0 | 1.00 | 3.439000e+01 | 27.90 | 0.000000e+00 | 0.000000e+00 | 5.400000e+01 | 5.900000e+01 | 6.000000e+01 | ▆▁▁▁▇ |
| availability_90 | 0 | 1.00 | 5.255000e+01 | 41.77 | 0.000000e+00 | 0.000000e+00 | 8.300000e+01 | 8.900000e+01 | 9.000000e+01 | ▆▁▁▁▇ |
| availability_365 | 0 | 1.00 | 1.723200e+02 | 154.74 | 0.000000e+00 | 0.000000e+00 | 1.680000e+02 | 3.620000e+02 | 3.650000e+02 | ▇▃▂▁▇ |
| number_of_reviews | 0 | 1.00 | 9.630000e+00 | 25.64 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 6.000000e+00 | 4.060000e+02 | ▇▁▁▁▁ |
| number_of_reviews_ltm | 0 | 1.00 | 3.060000e+00 | 6.93 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.000000e+00 | 1.050000e+02 | ▇▁▁▁▁ |
| review_scores_rating | 15873 | 0.56 | 9.458000e+01 | 9.58 | 2.000000e+01 | 9.300000e+01 | 9.800000e+01 | 1.000000e+02 | 1.000000e+02 | ▁▁▁▁▇ |
| review_scores_accuracy | 15892 | 0.56 | 9.640000e+00 | 0.92 | 2.000000e+00 | 1.000000e+01 | 1.000000e+01 | 1.000000e+01 | 1.000000e+01 | ▁▁▁▁▇ |
| review_scores_cleanliness | 15887 | 0.56 | 9.390000e+00 | 1.13 | 2.000000e+00 | 9.000000e+00 | 1.000000e+01 | 1.000000e+01 | 1.000000e+01 | ▁▁▁▁▇ |
| review_scores_checkin | 15893 | 0.56 | 9.810000e+00 | 0.72 | 2.000000e+00 | 1.000000e+01 | 1.000000e+01 | 1.000000e+01 | 1.000000e+01 | ▁▁▁▁▇ |
| review_scores_communication | 15885 | 0.56 | 9.760000e+00 | 0.81 | 2.000000e+00 | 1.000000e+01 | 1.000000e+01 | 1.000000e+01 | 1.000000e+01 | ▁▁▁▁▇ |
| review_scores_location | 15891 | 0.56 | 9.750000e+00 | 0.74 | 2.000000e+00 | 1.000000e+01 | 1.000000e+01 | 1.000000e+01 | 1.000000e+01 | ▁▁▁▁▇ |
| review_scores_value | 15889 | 0.56 | 9.290000e+00 | 1.06 | 2.000000e+00 | 9.000000e+00 | 1.000000e+01 | 1.000000e+01 | 1.000000e+01 | ▁▁▁▁▇ |
| calculated_host_listings_count | 0 | 1.00 | 8.070000e+00 | 34.23 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 3.000000e+00 | 3.190000e+02 | ▇▁▁▁▁ |
| calculated_host_listings_count_entire_homes | 0 | 1.00 | 7.200000e+00 | 33.90 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | 2.000000e+00 | 3.170000e+02 | ▇▁▁▁▁ |
| calculated_host_listings_count_private_rooms | 0 | 1.00 | 7.500000e-01 | 1.59 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 2.000000e+01 | ▇▁▁▁▁ |
| calculated_host_listings_count_shared_rooms | 0 | 1.00 | 9.000000e-02 | 0.65 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.100000e+01 | ▇▁▁▁▁ |
| reviews_per_month | 14991 | 0.58 | 5.500000e-01 | 0.73 | 1.000000e-02 | 1.100000e-01 | 2.600000e-01 | 6.800000e-01 | 8.550000e+00 | ▇▁▁▁▁ |
Below we analyze which variables are numbers
numeric_listings <- numeric_listings %>%
select(price,
weekly_price,
monthly_price,
security_deposit,
cleaning_fee,
guests_included,
extra_people,
property_type,
room_type,
accommodates,
bathrooms,
bedrooms,
beds,
square_feet,
neighbourhood,
neighbourhood_cleansed,
neighbourhood_group_cleansed,
is_location_exact,
latitude,
longitude,
number_of_reviews,
number_of_reviews_ltm,
review_scores_rating,
minimum_nights,
maximum_nights,
is_business_travel_ready,
cancellation_policy,
host_is_superhost,
host_identity_verified,
listing_url
)
We find the missing values after having computed summary statistics of the variable of interest
numeric_listings <- numeric_listings%>%
mutate(neighbourhood = as.factor(neighbourhood),
neighbourhood_cleansed = as.factor(neighbourhood_cleansed),
neighbourhood_group_cleansed = as.factor(neighbourhood_group_cleansed),
room_type = as.factor(room_type),
cancellation_policy = as.factor(cancellation_policy),
host_is_superhost = as.factor(host_is_superhost))
summary(numeric_listings)
## price weekly_price monthly_price security_deposit
## Min. : 0.0 Min. : 110 Min. : 360 Min. : 0
## 1st Qu.: 155.0 1st Qu.: 1120 1st Qu.: 3444 1st Qu.: 0
## Median : 300.0 Median : 2000 Median : 6000 Median : 500
## Mean : 785.9 Mean : 4361 Mean : 16609 Mean : 1191
## 3rd Qu.: 642.0 3rd Qu.: 4818 3rd Qu.: 16000 3rd Qu.: 1000
## Max. :132358.0 Max. :133832 Max. :582374 Max. :79415
## NA's :33335 NA's :33070 NA's :15680
## cleaning_fee guests_included extra_people property_type
## Min. : 0.0 Min. : 1.000 Min. : 0.00 Length:35731
## 1st Qu.: 89.0 1st Qu.: 1.000 1st Qu.: 0.00 Class :character
## Median : 150.0 Median : 1.000 Median : 0.00 Mode :character
## Mean : 164.3 Mean : 1.728 Mean : 42.35
## 3rd Qu.: 200.0 3rd Qu.: 2.000 3rd Qu.: 50.00
## Max. :7412.0 Max. :36.000 Max. :5552.00
## NA's :11584
## room_type accommodates bathrooms bedrooms
## Entire home/apt:25638 Min. : 1.000 Min. : 0.000 Min. : 0.000
## Hotel room : 173 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000
## Private room : 9143 Median : 4.000 Median : 1.000 Median : 1.000
## Shared room : 777 Mean : 4.184 Mean : 1.671 Mean : 1.623
## 3rd Qu.: 5.000 3rd Qu.: 2.000 3rd Qu.: 2.000
## Max. :160.000 Max. :20.000 Max. :30.000
## NA's :67 NA's :79
## beds square_feet neighbourhood
## Min. : 0.000 Min. : 0.0 Copacabana : 9262
## 1st Qu.: 1.000 1st Qu.: 0.0 Barra da Tijuca : 6037
## Median : 2.000 Median : 194.0 Ipanema : 3069
## Mean : 2.544 Mean : 525.3 Recreio dos Bandeirantes: 1832
## 3rd Qu.: 3.000 3rd Qu.: 753.0 Botafogo : 1765
## Max. :62.000 Max. :10764.0 (Other) :11874
## NA's :331 NA's :35279 NA's : 1892
## neighbourhood_cleansed neighbourhood_group_cleansed
## Copacabana : 9293 NA's:35731
## Barra da Tijuca : 4023
## Ipanema : 3024
## Jacarepaguá : 2030
## Botafogo : 1762
## Recreio dos Bandeirantes: 1685
## (Other) :13914
## is_location_exact latitude longitude number_of_reviews
## Mode :logical Min. :-23.07 Min. :-43.74 Min. : 0.000
## FALSE:11235 1st Qu.:-22.98 1st Qu.:-43.32 1st Qu.: 0.000
## TRUE :24496 Median :-22.97 Median :-43.20 Median : 1.000
## Mean :-22.97 Mean :-43.25 Mean : 9.634
## 3rd Qu.:-22.95 3rd Qu.:-43.19 3rd Qu.: 6.000
## Max. :-22.75 Max. :-43.10 Max. :406.000
##
## number_of_reviews_ltm review_scores_rating minimum_nights
## Min. : 0.00 Min. : 20.00 Min. : 1.000
## 1st Qu.: 0.00 1st Qu.: 93.00 1st Qu.: 1.000
## Median : 0.00 Median : 98.00 Median : 2.000
## Mean : 3.06 Mean : 94.58 Mean : 4.907
## 3rd Qu.: 2.00 3rd Qu.:100.00 3rd Qu.: 4.000
## Max. :105.00 Max. :100.00 Max. :1123.000
## NA's :15873
## maximum_nights is_business_travel_ready
## Min. : 1 Mode :logical
## 1st Qu.: 30 FALSE:35731
## Median : 1125
## Mean : 28655
## 3rd Qu.: 1125
## Max. :999999999
##
## cancellation_policy host_is_superhost host_identity_verified
## flexible :14893 FALSE:31028 Mode :logical
## moderate : 6418 TRUE : 4698 FALSE:26186
## strict : 3 NA's : 5 TRUE :9540
## strict_14_with_grace_period:14191 NA's :5
## super_strict_30 : 88
## super_strict_60 : 138
##
## listing_url
## Length:35731
## Class :character
## Mode :character
##
##
##
##
# Analyzing variable of interest price on its own and when computed and combined with other variables
# Price ?
favstats(~price, data = numeric_listings)
| min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|
| 0 | 155 | 300 | 642 | 1.32e+05 | 786 | 2.38e+03 | 35731 | 0 |
favstats(~weekly_price, data = numeric_listings)
| min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|
| 110 | 1.12e+03 | 2e+03 | 4.82e+03 | 1.34e+05 | 4.36e+03 | 7.58e+03 | 2396 | 33335 |
# Price per value of certain categorical variables
favstats(price~property_type, data = numeric_listings)
| property_type | min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|---|
| Aparthotel | 70 | 142 | 184 | 392 | 2.01e+03 | 362 | 401 | 30 | 0 |
| Apartment | 0 | 169 | 302 | 672 | 5.35e+04 | 746 | 1.88e+03 | 27450 | 0 |
| Barn | 529 | 529 | 529 | 529 | 529 | 529 | 1 | 0 | |
| Bed and breakfast | 0 | 91 | 139 | 298 | 1.32e+05 | 2.33e+03 | 1.17e+04 | 159 | 0 |
| Boat | 80 | 1.59e+03 | 3e+03 | 6.5e+03 | 4e+04 | 5.52e+03 | 8.56e+03 | 21 | 0 |
| Boutique hotel | 53 | 294 | 691 | 5e+03 | 1.2e+04 | 3.22e+03 | 3.95e+03 | 35 | 0 |
| Bungalow | 37 | 112 | 150 | 321 | 1e+03 | 270 | 293 | 9 | 0 |
| Cabin | 70 | 416 | 1e+03 | 1.39e+03 | 3.9e+03 | 1.18e+03 | 1.13e+03 | 10 | 0 |
| Camper/RV | 252 | 252 | 252 | 252 | 252 | 252 | 1 | 0 | |
| Campsite | 101 | 476 | 850 | 1.22e+03 | 1.6e+03 | 850 | 1.06e+03 | 2 | 0 |
| Casa particular (Cuba) | 37 | 54.8 | 88.5 | 114 | 2.06e+03 | 408 | 813 | 6 | 0 |
| Castle | 300 | 318 | 336 | 353 | 371 | 336 | 50.2 | 2 | 0 |
| Chalet | 54 | 107 | 163 | 258 | 578 | 199 | 133 | 32 | 0 |
| Condominium | 37 | 150 | 284 | 498 | 5.35e+04 | 532 | 1.77e+03 | 2078 | 0 |
| Cottage | 102 | 224 | 362 | 752 | 6e+03 | 837 | 1.34e+03 | 38 | 0 |
| Earth house | 37 | 83 | 118 | 289 | 3.75e+03 | 317 | 638 | 35 | 0 |
| Farm stay | 64 | 2.27e+03 | 4.47e+03 | 6.68e+03 | 8.88e+03 | 4.47e+03 | 6.23e+03 | 2 | 0 |
| Guest suite | 0 | 104 | 150 | 249 | 5.95e+03 | 264 | 565 | 227 | 0 |
| Guesthouse | 0 | 96 | 142 | 246 | 1.6e+03 | 213 | 248 | 138 | 0 |
| Hostel | 37 | 48 | 72.5 | 131 | 2.5e+04 | 1.67e+03 | 6e+03 | 98 | 0 |
| Hotel | 53 | 187 | 336 | 584 | 2.5e+04 | 1.31e+03 | 3.56e+03 | 114 | 0 |
| House | 0 | 118 | 268 | 854 | 5.35e+04 | 1.19e+03 | 3.5e+03 | 3563 | 0 |
| Houseboat | 252 | 1.88e+03 | 3.5e+03 | 1.67e+04 | 3e+04 | 1.13e+04 | 1.63e+04 | 3 | 0 |
| Hut | 79 | 79 | 79 | 79 | 79 | 79 | 1 | 0 | |
| Island | 150 | 196 | 256 | 400 | 701 | 341 | 248 | 4 | 0 |
| Loft | 37 | 139 | 201 | 344 | 1e+04 | 438 | 860 | 709 | 0 |
| Nature lodge | 54 | 70 | 79 | 101 | 4.28e+03 | 917 | 1.88e+03 | 5 | 0 |
| Other | 37 | 102 | 249 | 600 | 1.07e+04 | 710 | 1.43e+03 | 77 | 0 |
| Serviced apartment | 53 | 182 | 270 | 424 | 1.89e+04 | 423 | 860 | 655 | 0 |
| Tent | 43 | 99 | 155 | 578 | 1e+03 | 400 | 524 | 3 | 0 |
| Tiny house | 37 | 86 | 160 | 252 | 1.62e+03 | 241 | 287 | 42 | 0 |
| Townhouse | 43 | 97.2 | 150 | 260 | 2.81e+03 | 281 | 439 | 62 | 0 |
| Treehouse | 225 | 370 | 514 | 658 | 803 | 514 | 409 | 2 | 0 |
| Vacation home | 2.5e+03 | 2.5e+03 | 2.5e+03 | 2.5e+03 | 2.5e+03 | 2.5e+03 | 1 | 0 | |
| Villa | 43 | 150 | 450 | 2.36e+03 | 1.32e+05 | 3.78e+03 | 1.37e+04 | 115 | 0 |
| Yurt | 171 | 171 | 171 | 171 | 171 | 171 | 1 | 0 |
favstats(price~neighbourhood, data = numeric_listings)
| neighbourhood | min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|---|
| Abolição | 64 | 150 | 200 | 424 | 1e+03 | 349 | 352 | 6 | 0 |
| Anchieta | 101 | 438 | 699 | 1.42e+03 | 4.82e+03 | 1.33e+03 | 1.68e+03 | 7 | 0 |
| Andaraí | 42 | 80 | 198 | 600 | 1.7e+04 | 746 | 2.05e+03 | 85 | 0 |
| Barra da Tijuca | 0 | 225 | 450 | 1e+03 | 5.35e+04 | 1.11e+03 | 2.81e+03 | 6037 | 0 |
| Barra de Guaratiba | 43 | 187 | 300 | 600 | 4.82e+04 | 1.09e+03 | 5.16e+03 | 87 | 0 |
| Barros Filho | 70 | 78 | 86 | 94 | 102 | 86 | 22.6 | 2 | 0 |
| Benfica | 43 | 101 | 402 | 1e+03 | 3e+03 | 820 | 1.03e+03 | 9 | 0 |
| Bento Ribeiro | 64 | 118 | 177 | 455 | 1.2e+03 | 319 | 300 | 15 | 0 |
| Bonsucesso | 43 | 59 | 72.5 | 114 | 1.5e+03 | 152 | 304 | 22 | 0 |
| Botafogo | 37 | 127 | 219 | 450 | 3.5e+04 | 442 | 1.33e+03 | 1765 | 0 |
| Brás de Pina | 37 | 64 | 80 | 102 | 498 | 108 | 96.2 | 27 | 0 |
| Cachambi | 32 | 102 | 198 | 391 | 1.06e+03 | 289 | 242 | 41 | 0 |
| Caju | 54 | 105 | 134 | 408 | 1.2e+03 | 380 | 547 | 4 | 0 |
| Cascadura | 48 | 96 | 150 | 498 | 3.55e+04 | 2.38e+03 | 8.54e+03 | 17 | 0 |
| Catete | 37 | 128 | 201 | 448 | 8.89e+03 | 892 | 2.09e+03 | 343 | 0 |
| Catumbi | 43 | 101 | 402 | 600 | 669 | 363 | 284 | 5 | 0 |
| Cavalcante | 348 | 348 | 348 | 348 | 348 | 348 | 1 | 0 | |
| Centro | 37 | 101 | 150 | 300 | 2.14e+04 | 665 | 1.91e+03 | 573 | 0 |
| Cidade Nova | 59 | 102 | 150 | 314 | 3e+03 | 331 | 559 | 28 | 0 |
| Coelho Neto | 37 | 42.5 | 48 | 131 | 214 | 99.7 | 99.2 | 3 | 0 |
| Colégio | 70 | 72.2 | 74.5 | 76.8 | 79 | 74.5 | 6.36 | 2 | 0 |
| Complexo da Maré | 150 | 150 | 150 | 150 | 150 | 150 | 1 | 0 | |
| Complexo de Alemão | 37 | 178 | 318 | 459 | 600 | 318 | 398 | 2 | 0 |
| Copacabana | 0 | 159 | 252 | 498 | 1.32e+05 | 657 | 2.31e+03 | 9262 | 0 |
| Cordovil | 48 | 60 | 136 | 250 | 371 | 173 | 151 | 4 | 0 |
| Cosme Velho | 37 | 143 | 300 | 498 | 2e+04 | 573 | 1.88e+03 | 115 | 0 |
| Del Castilho | 48 | 85 | 209 | 440 | 1e+03 | 283 | 240 | 23 | 0 |
| Deodoro | 201 | 339 | 476 | 614 | 752 | 476 | 390 | 2 | 0 |
| Encantado | 70 | 112 | 166 | 257 | 3.18e+03 | 356 | 679 | 21 | 0 |
| Engenheiro Leal | 102 | 451 | 800 | 1.15e+03 | 1.5e+03 | 800 | 988 | 2 | 0 |
| Engenho da Rainha | 53 | 105 | 276 | 466 | 1.61e+03 | 420 | 507 | 8 | 0 |
| Engenho de Dentro | 43 | 153 | 389 | 598 | 2.14e+03 | 480 | 431 | 80 | 0 |
| Engenho Novo | 43 | 70 | 145 | 400 | 2e+03 | 309 | 399 | 51 | 0 |
| Estacio | 37 | 80 | 116 | 209 | 1.8e+03 | 257 | 371 | 89 | 0 |
| Flamengo | 37 | 127 | 225 | 450 | 3.5e+04 | 617 | 1.75e+03 | 941 | 0 |
| Gamboa | 37 | 118 | 198 | 498 | 8.89e+03 | 977 | 2.17e+03 | 33 | 0 |
| Gávea | 54 | 223 | 388 | 916 | 1.07e+04 | 883 | 1.39e+03 | 298 | 0 |
| Glória | 42 | 111 | 180 | 349 | 3e+04 | 452 | 1.8e+03 | 349 | 0 |
| Grajaú | 37 | 102 | 186 | 408 | 8e+03 | 479 | 1.08e+03 | 96 | 0 |
| Grumari | 91 | 194 | 296 | 598 | 899 | 429 | 420 | 3 | 0 |
| Guadalupe | 90 | 155 | 268 | 498 | 1e+03 | 361 | 271 | 13 | 0 |
| Higienópolis | 48 | 269 | 354 | 480 | 1.5e+03 | 436 | 403 | 10 | 0 |
| Honório Gurgel | 64 | 77.5 | 91 | 104 | 118 | 91 | 38.2 | 2 | 0 |
| Humaitá | 42 | 176 | 302 | 596 | 1.5e+04 | 544 | 1.01e+03 | 294 | 0 |
| Inhaúma | 54 | 73.5 | 193 | 326 | 798 | 267 | 262 | 8 | 0 |
| Ipanema | 32 | 238 | 402 | 799 | 4.57e+04 | 895 | 2.05e+03 | 3069 | 0 |
| Irajá | 43 | 59 | 214 | 382 | 1.33e+03 | 316 | 343 | 36 | 0 |
| Jacaré | 80 | 92 | 107 | 148 | 236 | 132 | 70.7 | 4 | 0 |
| Jardim América | 70 | 70 | 70 | 70 | 70 | 70 | 1 | 0 | |
| Jardim Botânico | 37 | 184 | 359 | 799 | 1.5e+04 | 891 | 1.7e+03 | 327 | 0 |
| Joá | 69 | 261 | 1.5e+03 | 4.05e+03 | 2.68e+04 | 4.01e+03 | 6.73e+03 | 100 | 0 |
| Lagoa | 54 | 252 | 498 | 1.1e+03 | 3.14e+04 | 1.08e+03 | 2.17e+03 | 362 | 0 |
| Lapa | 32 | 102 | 171 | 351 | 1.07e+04 | 674 | 1.76e+03 | 512 | 0 |
| Laranjeiras | 37 | 122 | 229 | 498 | 5.35e+04 | 520 | 2.07e+03 | 732 | 0 |
| Leblon | 48 | 286 | 450 | 872 | 2.5e+04 | 884 | 1.48e+03 | 1612 | 0 |
| Leme | 37 | 169 | 296 | 582 | 3.5e+04 | 611 | 1.73e+03 | 656 | 0 |
| Lins de Vasconcelos | 48 | 88.2 | 141 | 243 | 699 | 213 | 191 | 16 | 0 |
| Madureira | 59 | 114 | 161 | 618 | 1.23e+03 | 377 | 381 | 19 | 0 |
| Mangueira | 37 | 42 | 58 | 79 | 90 | 61.2 | 23 | 5 | 0 |
| Manguinhos | 86 | 240 | 394 | 547 | 701 | 394 | 435 | 2 | 0 |
| Maracanã | 43 | 115 | 332 | 1e+03 | 1.06e+04 | 717 | 1.05e+03 | 279 | 0 |
| Marechal Hermes | 59 | 90 | 134 | 241 | 1.8e+03 | 284 | 414 | 19 | 0 |
| Maria da Graça | 37 | 84.8 | 160 | 377 | 1.78e+03 | 398 | 552 | 14 | 0 |
| Méier | 43 | 102 | 201 | 300 | 2.12e+03 | 277 | 304 | 57 | 0 |
| Olaria | 48 | 80 | 98.5 | 306 | 1.34e+03 | 267 | 393 | 10 | 0 |
| Oswaldo Cruz | 69 | 169 | 401 | 498 | 2.14e+03 | 528 | 552 | 13 | 0 |
| Parada de Lucas | 64 | 70 | 80 | 118 | 701 | 207 | 277 | 5 | 0 |
| Parque Anchieta | 54 | 69.5 | 150 | 252 | 2.14e+03 | 381 | 632 | 11 | 0 |
| Parque Colúmbia | 198 | 198 | 198 | 198 | 198 | 198 | 1 | 0 | |
| Pavuna | 134 | 215 | 300 | 384 | 1.07e+03 | 401 | 342 | 6 | 0 |
| Penha | 43 | 70 | 91 | 198 | 1.5e+03 | 193 | 311 | 21 | 0 |
| Penha Circular | 79 | 115 | 184 | 244 | 252 | 175 | 85.3 | 4 | 0 |
| Piedade | 101 | 124 | 170 | 535 | 857 | 310 | 241 | 16 | 0 |
| Pilares | 37 | 64 | 91 | 375 | 584 | 210 | 211 | 9 | 0 |
| Praça da Bandeira | 48 | 102 | 198 | 628 | 5.35e+03 | 471 | 711 | 76 | 0 |
| Quintino Bocaiúva | 37 | 88 | 236 | 750 | 2.65e+03 | 615 | 832 | 11 | 0 |
| Ramos | 48 | 67.2 | 185 | 399 | 2e+04 | 2.5e+03 | 5.96e+03 | 12 | 0 |
| Recreio dos Bandeirantes | 32 | 193 | 353 | 900 | 3.76e+04 | 822 | 1.64e+03 | 1832 | 0 |
| Riachuelo | 53 | 139 | 428 | 648 | 1.61e+03 | 468 | 406 | 25 | 0 |
| Ricardo de Albuquerque | 118 | 357 | 800 | 1.77e+03 | 3.21e+03 | 1.21e+03 | 1.19e+03 | 6 | 0 |
| Rio Comprido | 42 | 102 | 187 | 498 | 2.89e+03 | 370 | 453 | 123 | 0 |
| Rocha | 43 | 110 | 300 | 668 | 2.12e+03 | 463 | 492 | 59 | 0 |
| Rocha Miranda | 101 | 200 | 300 | 549 | 798 | 400 | 359 | 3 | 0 |
| Rocinha | 48 | 71.2 | 107 | 626 | 2.38e+03 | 438 | 616 | 22 | 0 |
| Sampaio | 53 | 86 | 112 | 321 | 1.2e+03 | 260 | 307 | 17 | 0 |
| Santa Teresa | 32 | 102 | 182 | 344 | 5.29e+04 | 575 | 2.52e+03 | 1107 | 0 |
| Santo Cristo | 37 | 87.5 | 225 | 406 | 8.89e+03 | 1.2e+03 | 2.6e+03 | 12 | 0 |
| São Conrado | 43 | 219 | 498 | 1.75e+03 | 5.57e+04 | 2.17e+03 | 5.57e+03 | 248 | 0 |
| São Cristóvão | 37 | 102 | 150 | 434 | 2.9e+03 | 379 | 543 | 123 | 0 |
| Saúde | 48 | 98.5 | 246 | 646 | 3.65e+04 | 2.91e+03 | 7e+03 | 31 | 0 |
| Tijuca | 32 | 102 | 198 | 551 | 3.3e+04 | 546 | 1.52e+03 | 722 | 0 |
| Todos os Santos | 43 | 104 | 198 | 428 | 5.35e+03 | 523 | 969 | 39 | 0 |
| Tomás Coelho | 64 | 70 | 90 | 123 | 1.07e+03 | 230 | 372 | 7 | 0 |
| Urca | 54 | 160 | 289 | 574 | 1e+04 | 720 | 1.25e+03 | 142 | 0 |
| Vaz Lobo | 300 | 650 | 1e+03 | 1.17e+03 | 1.34e+03 | 880 | 530 | 3 | 0 |
| Vidigal | 37 | 102 | 198 | 404 | 2.5e+04 | 1.56e+03 | 5.01e+03 | 176 | 0 |
| Vigário Geral | 59 | 82.5 | 106 | 200 | 294 | 153 | 124 | 3 | 0 |
| Vila da Penha | 59 | 96.5 | 150 | 310 | 1.1e+03 | 239 | 260 | 15 | 0 |
| Vila Isabel | 32 | 102 | 249 | 687 | 1.12e+04 | 567 | 944 | 310 | 0 |
| Vila Kosmos | 64 | 64 | 72 | 82.5 | 90 | 74.5 | 12.8 | 4 | 0 |
| Vincente de Carvalho | 37 | 51 | 900 | 1.6e+03 | 1.78e+03 | 861 | 811 | 7 | 0 |
favstats(price~room_type, data = numeric_listings)
| room_type | min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|---|
| Entire home/apt | 0 | 201 | 391 | 803 | 1.32e+05 | 957 | 2.54e+03 | 25638 | 0 |
| Hotel room | 0 | 96 | 185 | 397 | 1e+04 | 976 | 2.23e+03 | 173 | 0 |
| Private room | 0 | 101 | 150 | 296 | 1.32e+05 | 328 | 1.79e+03 | 9143 | 0 |
| Shared room | 32 | 59 | 101 | 201 | 2.5e+04 | 477 | 2.39e+03 | 777 | 0 |
# Summary statistics for numeric variables that might influence price
favstats(~cleaning_fee, data = numeric_listings)
| min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|
| 0 | 89 | 150 | 200 | 7.41e+03 | 164 | 194 | 24147 | 11584 |
favstats(~review_scores_rating, data = numeric_listings)
| min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|
| 20 | 93 | 98 | 100 | 100 | 94.6 | 9.58 | 19858 | 15873 |
For the summary, we can conclude: 1. Price range is around 0 - 132358 USD, with average value 785.9 USD
There are 4 types of room: Entire home/apt, hotel room, private room and shared room. What’s more, entire home/apt represents a significant portion of the total populations of accommodations on Airbnb
The top 5 most popular neighbourhoods in Rio are Copacabana, Barra da Tijuca, Ipanema and Jacarepaguá.
There are 36 different property types but 77% of the accommodations are apartments.
Most of the hosts (87%) in Rio are not super host.
numeric_only_listings <- numeric_listings %>%
# Analyzing the numerical factors of the dataset
select(
host_is_superhost,
accommodates,
bedrooms,
number_of_reviews,
square_feet,
price,
weekly_price,
cleaning_fee,
extra_people,
minimum_nights,
maximum_nights,
number_of_reviews,
review_scores_rating
)
# We analyze the correlation by plotting a matrix
numeric_only_listings %>%
ggcorr( palette = "BlGr", label = TRUE, label_size=3, hjust=1,size=3) +
theme(legend.title = element_text(size = 14))
# It looks like we can reduce the number of variables for the Scatterplot Matrix (corr >=0.2)
numeric_listings %>%
select(price,
square_feet,
weekly_price,
number_of_reviews,
review_scores_rating,
extra_people
) %>%
ggpairs(axisLabels = "show")
As the color graph above displays, we can see that most variables have a weak (if any) correlation.
However, there are certain variables which exhibit a significant stastical correlation, such as Bedrooms & the number of people being accomodated, as well as price & weekly price. This makes sense, as the more bedrooms there are, the more one will be charged on average.
listings <- listings %>%
mutate(price = parse_number(price))
We also noticed that there are many missing values in the data set. There are 35731 NAs for neighbourhood_group_cleansed, 35279 NAs for square_feet, 33070 NAs for monthly_price, 33335 NAs for weekly_price, 15680 NAs for security_deposit, 11584 NAs for cleaning_fee, 15873 NAs for review_scores_rating, 79 NAs for beds, 79 NAs for bedrooms, 67 NAs for bathrooms, 5 NAs for host_is_superhost and 5 NAs for host_identity_verified.
Because NAs is the property not having these information, we will clean NAs equal to zeros.
# cleaning_fee values that are NAs equal to zero
numeric_listings <- numeric_listings %>%
mutate(cleaning_fee = case_when(
is.na(cleaning_fee) ~ 0,
TRUE ~ cleaning_fee)
)
# property_type is a new variable we are creating which contains 5 different categories which includes the top 4 and others
numeric_listings <- numeric_listings %>%
mutate(prop_type_simplified = case_when(
property_type %in% c("Apartment","Condominium", "House","Loft") ~ property_type,
TRUE ~ "Other"
))
The most common value for minimum nights is 1 night. The intended purpose is to provide flexibility to guests.
# minimum_nights values displayed
min_nights <- numeric_listings %>%
count(minimum_nights) %>%
arrange(desc(n))
# minimum_nights displayed in plot
ggplot(numeric_listings, aes(x=minimum_nights)) +
geom_histogram(bins=10) +
labs(y="Count",x="Minimum nights",title="Frequency of minimum_nights") +
xlim(0,50)+
theme_bw()
Filter the airbnb data so that it only includes observations with minimum_nights <= 4
# Filter the data so that it only includes observations with minimum_nights <= 4
numeric_listings <- numeric_listings %>% filter(minimum_nights<=4)
Moving on with our filtered data set, we want to have a mapping of Airbnb listings on Rio.
As seen below, the most properties are located along the waterfront. To no suprise, the further inland one goes the less properties are shown, and naturally there are no properties in the national park.
leaflet(data = filter(numeric_listings)) %>%
addProviderTiles("OpenStreetMap.Mapnik") %>%
addCircleMarkers(lng = ~longitude,
lat = ~latitude,
radius = 1,
fillColor = "blue",
fillOpacity = 0.4,
popup = ~listing_url,
label = ~property_type)
##Regression Analysis
# Create a new variable called price_4_nights
numeric_listings <- numeric_listings %>%
mutate(price_4_nights=case_when(
guests_included==1 ~ 4*price+cleaning_fee+extra_people,
TRUE ~ 4*price+cleaning_fee))
# density plots to examine the distributions of price_4_nights
numeric_listings %>% ggplot(aes(x=price_4_nights)) +
geom_histogram() +
xlim(0,15000)
# density plots to examine the distributions of log(price_4_nights)
numeric_listings %>% ggplot(aes(x=log(price_4_nights))) +
geom_histogram() +
xlim(0,12) +
NULL
We should use log(price_4_nights) as the variable for the regression analysis as it is easier to be precise and to analyze the data than we the raw price_4_nights
Regression model called model1 with: prop_type_simplified, number_of_reviews, and review_scores_rating
# Creating the regression model
model1 <- lm(log(0.01+(price_4_nights)) ~
prop_type_simplified +
number_of_reviews +
review_scores_rating,
data = numeric_listings)
# Display the coefficients
model1 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 6.97 | 0.0615 | 113 | 0 |
| prop_type_simplifiedCondominium | 0.015 | 0.0246 | 0.605 | 0.545 |
| prop_type_simplifiedHouse | -0.271 | 0.0232 | -11.7 | 0 |
| prop_type_simplifiedLoft | -0.132 | 0.0396 | -3.33 | 0.001 |
| prop_type_simplifiedOther | -0.159 | 0.0258 | -6.15 | 0 |
| number_of_reviews | -0.003 | 0.000186 | -13.9 | 0 |
| review_scores_rating | 0.001 | 0.000646 | 1.51 | 0.13 |
# R squared value ?
model1 %>% glance()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.0204 | 0.02 | 0.803 | 58.8 | 2.23e-72 | 6 | -2.04e+04 | 4.08e+04 | 4.08e+04 | 1.1e+04 | 16971 | 16978 |
The review_scores_rating coefficient equals to 0.001. Having performed a logarithmic transformation of the dependent variable but not of the independent variables, we can interpret the effect of the coefficients as follows: we take the exponent of the estimate to remove the logarithm, subtract 1 from this result, and then multiply by 100 to obtain the percentage change in the dependent variable following a unit increase in the independent variable.
Thus, for each increase in the variable, the price will increase by [exp(0.001) - 1] * 100 = ~0.1%.
Moving on to property type, we note that the base category in this model is apartments. Hence, the coefficients for the other four property types can be interpreted as follows: - Condominium - coefficient: 0.015. Hence, if the the property is a condo, it adds ~1.51% to the price. - House - coefficient: -0.271. So, an Airbnb of this type has a price that is ~23.7% lower than an apartment. - Loft - coefficient: -0.132. This says that a loft is ~12.4% less expensive than an apartment. - Other - coefficient: -0.147. From this we see that any other property type besides a condo, house or loft costs ~14.7% more compared to an apartment.
In conclusion, we can see that the type of property having the most effect on our model is the “Other” type of property. This could be due to the fact that exotic types of property are more expensive than flats.
Overall, “model 1” has an adjusted R-squared value of 0.020. This means that <2% of the variance is described by this model. This is a very poor performance…
# Creating the regression model
model2 <- lm(log(0.01+(price_4_nights)) ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type,
data = numeric_listings)
# View the coefficients
model2 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 7.05 | 0.055 | 128 | 0 |
| prop_type_simplifiedCondominium | 0.029 | 0.0219 | 1.33 | 0.184 |
| prop_type_simplifiedHouse | 0.062 | 0.0214 | 2.91 | 0.004 |
| prop_type_simplifiedLoft | -0.262 | 0.0354 | -7.39 | 0 |
| prop_type_simplifiedOther | 0.006 | 0.0244 | 0.252 | 0.801 |
| number_of_reviews | -0.003 | 0.000167 | -20.7 | 0 |
| review_scores_rating | 0.003 | 0.000578 | 4.45 | 0 |
| room_typeHotel room | -0.557 | 0.0703 | -7.92 | 0 |
| room_typePrivate room | -0.806 | 0.0128 | -62.8 | 0 |
| room_typeShared room | -1.1 | 0.042 | -26.2 | 0 |
# check the R squared value
model2 %>% glance()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.219 | 0.219 | 0.717 | 529 | 0 | 9 | -1.84e+04 | 3.69e+04 | 3.7e+04 | 8.73e+03 | 16968 | 16978 |
We can see that room_type has not improved much the models with an adjusted R squared value to ~0.22.This is a huge improvements from the previous model! Bathrooms, bedrooms, beds, or size of the house (accomodates) significant predictors of price_4_nights?
# Bathrooms, bedrooms, beds and accommodates into model2
model3_withbed <- lm(log(0.01+(price_4_nights)) ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type +
bedrooms +
bathrooms +
beds +
accommodates,
data = numeric_listings)
# View the coefficients
m3_sig_factors_withbed <- model3_withbed %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
m3_sig_factors_withbed
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 6.36 | 0.0498 | 128 | 0 |
| prop_type_simplifiedCondominium | -0.049 | 0.0194 | -2.52 | 0.012 |
| prop_type_simplifiedHouse | -0.151 | 0.0192 | -7.87 | 0 |
| prop_type_simplifiedLoft | 0.024 | 0.0316 | 0.748 | 0.455 |
| prop_type_simplifiedOther | 0.027 | 0.0215 | 1.25 | 0.213 |
| number_of_reviews | -0.003 | 0.000147 | -18.7 | 0 |
| review_scores_rating | 0.002 | 0.000511 | 4.09 | 0 |
| room_typeHotel room | -0.356 | 0.062 | -5.74 | 0 |
| room_typePrivate room | -0.522 | 0.0126 | -41.5 | 0 |
| room_typeShared room | -0.812 | 0.038 | -21.3 | 0 |
| bedrooms | 0.216 | 0.00827 | 26.1 | 0 |
| bathrooms | 0.151 | 0.00798 | 19 | 0 |
| beds | -0.032 | 0.00351 | -9.15 | 0 |
| accommodates | 0.043 | 0.00278 | 15.4 | 0 |
# check the R squared value
model3_withbed %>% glance()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.398 | 0.398 | 0.63 | 858 | 0 | 13 | -1.61e+04 | 3.23e+04 | 3.24e+04 | 6.7e+03 | 16865 | 16879 |
Thanks to these new variables, our Adj. R squared increased to 0.39 which is a lot better than our previous model.
# Remove beds variable and run model3 again
# Add bathrooms, bedrooms, beds and accommodates to model2
model3 <- lm(log((0.01+price_4_nights)) ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type +
bedrooms +
bathrooms +
accommodates,
data = numeric_listings)
# View the coefficients
m3_sig_factors <- model3 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
m3_sig_factors
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 6.35 | 0.0498 | 127 | 0 |
| prop_type_simplifiedCondominium | -0.051 | 0.0194 | -2.64 | 0.008 |
| prop_type_simplifiedHouse | -0.156 | 0.0192 | -8.12 | 0 |
| prop_type_simplifiedLoft | 0.021 | 0.0316 | 0.68 | 0.496 |
| prop_type_simplifiedOther | 0.019 | 0.0215 | 0.895 | 0.371 |
| number_of_reviews | -0.003 | 0.000147 | -18.9 | 0 |
| review_scores_rating | 0.002 | 0.000511 | 4.44 | 0 |
| room_typeHotel room | -0.38 | 0.062 | -6.12 | 0 |
| room_typePrivate room | -0.519 | 0.0126 | -41.3 | 0 |
| room_typeShared room | -0.861 | 0.0373 | -23.1 | 0 |
| bedrooms | 0.198 | 0.00797 | 24.9 | 0 |
| bathrooms | 0.142 | 0.00792 | 17.9 | 0 |
| accommodates | 0.033 | 0.00255 | 12.8 | 0 |
# check the R squared value
model3 %>% glance()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.395 | 0.395 | 0.632 | 922 | 0 | 12 | -1.62e+04 | 3.25e+04 | 3.26e+04 | 6.75e+03 | 16922 | 16935 |
Removing beds, the adjusted R squared slightly decreases, and so we remove “beds”.
To avoid the overfitting in our model, we want to check for multicollinearity to see if we need to leave out any variables.
vif(model3)
## GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.293877 4 1.032730
## number_of_reviews 1.023532 1 1.011697
## review_scores_rating 1.007104 1 1.003546
## room_type 1.527245 3 1.073128
## bedrooms 2.692697 1 1.640944
## bathrooms 2.255957 1 1.501984
## accommodates 1.893678 1 1.376110
We have to take care of the VIF that are above 5 and if it is above 10, it is alarming. as we don’t have any values above 5, we want to keep all the variables so far.
As we want to see if superhosts command a pricing premium we extend our model. We want to add the variable host_is_superhost as an extra explanatory variable after taking control of the other variables.
# add in host_is_superhost
model4 <- lm(log((0.01+price_4_nights)) ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type +
bedrooms +
bathrooms +
accommodates +
host_is_superhost,
data = numeric_listings)
# View the coefficients
m4_sig_factors <- model4 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
m4_sig_factors
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 6.32 | 0.0501 | 126 | 0 |
| prop_type_simplifiedCondominium | -0.047 | 0.0194 | -2.42 | 0.016 |
| prop_type_simplifiedHouse | -0.154 | 0.0192 | -8.03 | 0 |
| prop_type_simplifiedLoft | 0.024 | 0.0316 | 0.767 | 0.443 |
| prop_type_simplifiedOther | 0.021 | 0.0215 | 0.991 | 0.322 |
| number_of_reviews | -0.003 | 0.000157 | -16.1 | 0 |
| review_scores_rating | 0.003 | 0.000517 | 5.11 | 0 |
| room_typeHotel room | -0.372 | 0.062 | -6.01 | 0 |
| room_typePrivate room | -0.519 | 0.0126 | -41.3 | 0 |
| room_typeShared room | -0.863 | 0.0373 | -23.1 | 0 |
| bedrooms | 0.198 | 0.00796 | 24.9 | 0 |
| bathrooms | 0.142 | 0.00792 | 17.9 | 0 |
| accommodates | 0.033 | 0.00255 | 12.9 | 0 |
| host_is_superhostTRUE | -0.061 | 0.0127 | -4.8 | 0 |
# check the R squared value
model4 %>% glance()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.396 | 0.396 | 0.631 | 854 | 0 | 13 | -1.62e+04 | 3.25e+04 | 3.26e+04 | 6.74e+03 | 16921 | 16935 |
The adjusted R squared increases by ~7.883e-3. We notice that host_is_superhost have a p-value of 0.000 which means that it is extremely significant.
A significant proportion of owners advertise the exact location of their listing (is_location_exact == TRUE), while a significant proportion don’t. Now, we will explore whether a listing’s precise location significantly predicts price_4_nights after once more controlling for other variables.
# add in is_location_exact
model5 <- lm(log((0.01+price_4_nights)) ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type +
bedrooms +
bathrooms +
accommodates +
host_is_superhost +
is_location_exact,
data = numeric_listings)
# View the coefficients
m5_sig_factors <- model5 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
m5_sig_factors
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 6.32 | 0.0509 | 124 | 0 |
| prop_type_simplifiedCondominium | -0.047 | 0.0194 | -2.42 | 0.016 |
| prop_type_simplifiedHouse | -0.155 | 0.0192 | -8.05 | 0 |
| prop_type_simplifiedLoft | 0.024 | 0.0316 | 0.768 | 0.442 |
| prop_type_simplifiedOther | 0.021 | 0.0215 | 0.988 | 0.323 |
| number_of_reviews | -0.003 | 0.000157 | -16.1 | 0 |
| review_scores_rating | 0.003 | 0.000517 | 5.11 | 0 |
| room_typeHotel room | -0.373 | 0.062 | -6.01 | 0 |
| room_typePrivate room | -0.519 | 0.0126 | -41.3 | 0 |
| room_typeShared room | -0.863 | 0.0373 | -23.1 | 0 |
| bedrooms | 0.198 | 0.00796 | 24.9 | 0 |
| bathrooms | 0.142 | 0.00792 | 17.9 | 0 |
| accommodates | 0.033 | 0.00255 | 12.9 | 0 |
| host_is_superhostTRUE | -0.061 | 0.0127 | -4.77 | 0 |
| is_location_exactTRUE | -0.006 | 0.011 | -0.561 | 0.575 |
# check the R squared value
model5 %>% glance()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.396 | 0.396 | 0.631 | 793 | 0 | 14 | -1.62e+04 | 3.25e+04 | 3.26e+04 | 6.74e+03 | 16920 | 16935 |
We didn’t improve the model so we will leave out `is_location_exact’ as a variable
For all cities, there are 3 variables that relate to neighbourhoods: neighbourhood, neighbourhood_cleansed, and neighbourhood_group_cleansed. There are typically more than 20 neighbourhoods in each city, and it wouldn’t make sense to include them all in our model. [.] The new categorical variable that we create for this will be called neighbourhood_simplified.
We refer to our groupings as zones, which are as follows: - Zone 1: Maracanã - Zone 2: Caju, Centro, Catumbi, Cidade Nova, Lapa, Santa, Teresa - Zone 3: Copacabana, Ipanema, Botafogo, Catete, Cosme Velho - Zone 4: Barra de Tijuca
Rio is commonly broken down into 4 zones: North(1), Central(2), South(3) and West(4) and we have grouped accordingly above.
numeric_listings <- numeric_listings %>%
mutate(zones = case_when(neighbourhood_cleansed %in% c("Maracanã") ~ "Zone1",
neighbourhood_cleansed %in% c("Caju, Centro, Catumbi, Cidade Nova, Lapa, Santa
Teresa") ~ "Zone2",
neighbourhood_cleansed %in% c("Copacabana, Ipanema, Botafogo, Catete, Cosme Velho") ~ "Zone3",
neighbourhood_cleansed %in% c("Barra da Tijuca") ~ "Zone4"))
price_neighbourhoods <- numeric_listings %>%
group_by(zones) %>%
summarise(mean= mean(price_4_nights)) %>% # convert to GBP
arrange(desc(mean))
## `summarise()` ungrouping output (override with `.groups` argument)
model6 <- lm(log(0.01+price_4_nights) ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type +
bedrooms +
bathrooms +
accommodates +
host_is_superhost +
zones,
data = numeric_listings)
# View the coefficients
m6_sig_factors <- model6 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
m6_sig_factors
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 5.86 | 0.16 | 36.6 | 0 |
| prop_type_simplifiedCondominium | -0.117 | 0.0367 | -3.18 | 0.001 |
| prop_type_simplifiedHouse | -0.188 | 0.051 | -3.69 | 0 |
| prop_type_simplifiedLoft | 0.075 | 0.0926 | 0.808 | 0.419 |
| prop_type_simplifiedOther | -0.089 | 0.0455 | -1.96 | 0.05 |
| number_of_reviews | -0.005 | 0.000603 | -8.27 | 0 |
| review_scores_rating | 0.005 | 0.00152 | 3 | 0.003 |
| room_typeHotel room | -0.286 | 0.152 | -1.88 | 0.06 |
| room_typePrivate room | -0.406 | 0.0373 | -10.9 | 0 |
| room_typeShared room | -0.753 | 0.103 | -7.29 | 0 |
| bedrooms | 0.157 | 0.0244 | 6.44 | 0 |
| bathrooms | 0.127 | 0.0198 | 6.39 | 0 |
| accommodates | 0.051 | 0.00857 | 5.97 | 0 |
| host_is_superhostTRUE | -0.092 | 0.0352 | -2.61 | 0.009 |
| zonesZone4 | 0.478 | 0.0693 | 6.89 | 0 |
# check the R squared value
model6 %>% glance()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.468 | 0.464 | 0.56 | 114 | 2.7e-237 | 14 | -1.53e+03 | 3.09e+03 | 3.18e+03 | 570 | 1820 | 1835 |
This model is much better compared to model5 before. Specifically, the adjusted R squared now became 0.4640166. Again, we check for multicollinearity to see if the improvement of the model is wrongly due to the interaction between the explanatory variables.
vif(model6)
## GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.421483 4 1.044943
## number_of_reviews 1.165108 1 1.079401
## review_scores_rating 1.033025 1 1.016378
## room_type 1.710524 3 1.093591
## bedrooms 3.653351 1 1.911374
## bathrooms 2.873010 1 1.694996
## accommodates 2.733020 1 1.653185
## host_is_superhost 1.170336 1 1.081821
## zones 1.059802 1 1.029467
As no VIF scores are anywhere close to 5 we conclude that we still have no multicollinearity in the model, and hence we move forward with this model.
Next, we want to see how the cancellation policy affects the price_4_nights. This variable is a categorical variable with the possible values “moderate”, “flexible”, “strict_14_with_grace_period”, and “super_strict_60”.
model7 <- lm(log((0.01+price_4_nights)) ~
prop_type_simplified +
number_of_reviews +
review_scores_rating +
room_type +
bedrooms +
bathrooms +
accommodates +
host_is_superhost +
zones +
cancellation_policy,
data = numeric_listings)
m7_sig_factors <- model7 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
m7_sig_factors
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 5.86 | 0.16 | 36.6 | 0 |
| prop_type_simplifiedCondominium | -0.115 | 0.0368 | -3.12 | 0.002 |
| prop_type_simplifiedHouse | -0.185 | 0.0511 | -3.63 | 0 |
| prop_type_simplifiedLoft | 0.075 | 0.0926 | 0.813 | 0.416 |
| prop_type_simplifiedOther | -0.093 | 0.0456 | -2.03 | 0.042 |
| number_of_reviews | -0.005 | 0.000609 | -8.04 | 0 |
| review_scores_rating | 0.005 | 0.00152 | 3.04 | 0.002 |
| room_typeHotel room | -0.286 | 0.152 | -1.87 | 0.061 |
| room_typePrivate room | -0.408 | 0.0375 | -10.9 | 0 |
| room_typeShared room | -0.754 | 0.103 | -7.29 | 0 |
| bedrooms | 0.154 | 0.0245 | 6.29 | 0 |
| bathrooms | 0.129 | 0.0199 | 6.5 | 0 |
| accommodates | 0.051 | 0.00858 | 5.93 | 0 |
| host_is_superhostTRUE | -0.089 | 0.0353 | -2.51 | 0.012 |
| zonesZone4 | 0.477 | 0.0694 | 6.88 | 0 |
| cancellation_policymoderate | -0.011 | 0.0359 | -0.302 | 0.762 |
| cancellation_policystrict | 0.211 | 0.562 | 0.375 | 0.708 |
| cancellation_policystrict_14_with_grace_period | -0.025 | 0.0317 | -0.777 | 0.437 |
| cancellation_policysuper_strict_60 | 0.3 | 0.159 | 1.89 | 0.059 |
model7 %>% glance()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.469 | 0.464 | 0.56 | 89.3 | 2.94e-234 | 18 | -1.53e+03 | 3.1e+03 | 3.21e+03 | 569 | 1816 | 1835 |
The adjusted R squared improved again.
In the end, we are left with two models to choose between: model6 and model5'. To compare the models side by side, we have created a HUX table. In the table below,model6is to the left, andmodel5` is to the right.
huxreg("model6"=model6,
"model5"=model5,
statistics = c('#observations' = 'nobs',
'R squared' = 'r.squared',
'Adj. R Squared' = 'adj.r.squared',
'Residual SE' = 'sigma'),
bold_signif = 0.05,
stars = NULL
) %>%
set_caption('Comparison of models')
| model6 | model5 | |
|---|---|---|
| (Intercept) | 5.857 | 6.324 |
| (0.160) | (0.051) | |
| prop_type_simplifiedCondominium | -0.117 | -0.047 |
| (0.037) | (0.019) | |
| prop_type_simplifiedHouse | -0.188 | -0.155 |
| (0.051) | (0.019) | |
| prop_type_simplifiedLoft | 0.075 | 0.024 |
| (0.093) | (0.032) | |
| prop_type_simplifiedOther | -0.089 | 0.021 |
| (0.046) | (0.022) | |
| number_of_reviews | -0.005 | -0.003 |
| (0.001) | (0.000) | |
| review_scores_rating | 0.005 | 0.003 |
| (0.002) | (0.001) | |
| room_typeHotel room | -0.286 | -0.373 |
| (0.152) | (0.062) | |
| room_typePrivate room | -0.406 | -0.519 |
| (0.037) | (0.013) | |
| room_typeShared room | -0.753 | -0.863 |
| (0.103) | (0.037) | |
| bedrooms | 0.157 | 0.198 |
| (0.024) | (0.008) | |
| bathrooms | 0.127 | 0.142 |
| (0.020) | (0.008) | |
| accommodates | 0.051 | 0.033 |
| (0.009) | (0.003) | |
| host_is_superhostTRUE | -0.092 | -0.061 |
| (0.035) | (0.013) | |
| zonesZone4 | 0.478 | |
| (0.069) | ||
| is_location_exactTRUE | -0.006 | |
| (0.011) | ||
| #observations | 1835 | 16935 |
| R squared | 0.468 | 0.396 |
| Adj. R Squared | 0.464 | 0.396 |
| Residual SE | 0.560 | 0.631 |
To check the model diagnostics we want to use the autoplot(). In order to test the linearity hypothesis, we want to create a horizontal line in the residues in relation to the adjusted plots. The points should ideally be drawn along the dotted line for the qqplot. For scaled plotting, a horizontal line is desirable with points evenly distributed above and below to satisfy the assumption of homogeneity in a linear regression (i.e. the variance of the residuals is constant).
autoplot(model6) +
theme_minimal() +
labs (title = "Model6 Diagnostic Plots")
To conclude our research, we will use a model to predict the price for an Airbnb listing for 2 guest and 4 nights. We use (https://www.airbnb.fr/rooms/12290017?source_impression_id=p3_1603035619_fqRCUYlBbCh3Zya7) that is an apartment with a private room, has at least 10 reviews, and an average rating of at least 90.
We will include the appropriate 95% interval with our prediction and report this and the point prediction in terms of price_4_nights.
library(stats)
library(datasets)
library(prediction)
# Requirements: private room, >=10 reviews, average rating of >=90.
# Filter on requirements and take a random subsample
set.seed(1234)
subset_requirements <- numeric_listings %>%
filter(!is.na(square_feet),
accommodates>=2,
number_of_reviews>=10,
review_scores_rating >= 90,
prop_type_simplified == "Apartment",
room_type == "Private room") %>%
sample_n(size=20) # These will be the observations we want to predict
training_set <- anti_join(numeric_listings, subset_requirements) # This initializes our model
# Run model on the training set
training_model <- lm(log(price_4_nights) ~
prop_type_simplified +
review_scores_rating +
room_type +
bedrooms +
zones +
square_feet,
data = training_set)
# Predict price_4_nights for the subset_requirements
forecast_value <- predict(training_model, newdata = subset_requirements,interval = "confidence")
# Take exponent of fitted values to remove the log transformation
forecast_value <- as.data.frame(forecast_value) %>%
mutate(exp_fit = exp(fit),
exp_lower = exp(lwr),
exp_upper = exp(upr),
real_price = subset_requirements$price_4_nights,
predic_error = real_price - exp_fit)
mean(forecast_value$real_price)
## [1] 826.2
average_prediction <- mean(forecast_value$real_price)
CI_upper <- quantile(forecast_value$real_price, 0.975)
CI_lower <- quantile(forecast_value$real_price, 0.025)
CI_lower
## 2.5%
## 372.35
CI_upper
## 97.5%
## 1919.5
In this project, we wanted to predict the prices of an Airbnb listing for 2 guests during 4 nights. Our final model was based on the highest R squared value, which was model6.